# Relabel the effect_* quantile codes into human-readable legend labels.
#
# @param df     Data frame containing a column of effect_* codes.
# @param invar  Column (tidy-eval) holding "effect_20"/"effect_50"/"effect_80".
# @param outvar Column (tidy-eval) to create/overwrite with the nice labels.
# @return `df` with `outvar` holding the recoded labels.
make_nice_effect_quantiles <- function(df, invar, outvar) {
  df %>%
    mutate(
      {{ outvar }} := recode(
        {{ invar }},
        effect_20 = "Effect at 20% quantile",
        effect_50 = "Effect at 50% quantile",
        effect_80 = "Effect at 80% quantile"
      )
    )
}

# Shared colour scale for the quantile-effect plots: one hue ("#007FA8")
# lightened progressively so the 20%/50%/80% labels read as a sequential
# ramp. Stored in a list so it can be added to a plot via `+ effect_scale`.
effect_scale <- list(
  scale_color_manual(values = c(
    "Effect at 20% quantile" = colorspace::lighten("#007FA8", .7),
    "Effect at 50% quantile" = colorspace::lighten("#007FA8", .4),
    "Effect at 80% quantile" = "#007FA8"
  ))
)
# Load the fitted models from disk. `file_refit = "never"` makes brm()
# read the cached fit only and never re-sample.
hm <- read_rds("final_models/17-brm-large-sample.rds.bz2")

hm_simple <- brm(file = "final_models/hm_final_rerun.rds", file_refit = "never")

Posterior predictive check

# Posterior predictive check for the large mixture model; the x-axis is
# capped at 8000 so the long APC tail does not dominate the plot.
# NOTE(review): pp_check() may silently ignore `cores` — the hm_simple
# call below warned "unrecognized and ignored: cores"; confirm it is
# actually used here.
pp_check(hm, ndraws = 10, cores = mc_cores) +
  coord_cartesian(xlim = c(0, 8000))

# Plot posterior predictive intervals against observed APCs for one country.
#
# @param df                Data with (scaled, centred) P_top10, country,
#                          field and APC_in_dollar.
# @param model             Fitted brms model to predict from.
# @param country_selection Country to display.
# @param var_for_offset    Scaled variable whose "scaled:center" attribute
#                          is used to undo the centring.
# @param alpha             Transparency of the observed data points.
# @param ndraws            Number of posterior draws for the intervals.
pred_vis <- function(df, model, country_selection,
                     var_for_offset = base$P_top10, alpha = 1, ndraws = 1000) {
  offset <- attributes(var_for_offset)[["scaled:center"]]
  # Undo centring and the log transform: back to the original P_top10 scale.
  unscale <- function(d) mutate(d, P_top10 = exp(offset + P_top10))

  country_data <- filter(df, country == country_selection)

  country_data %>%
    modelr::data_grid(P_top10, country, field) %>%
    add_predicted_draws(model, ndraws = ndraws, re_formula = NULL,
                        cores = mc_cores) %>%
    unscale() %>%
    ggplot(aes(P_top10, .prediction)) +
    stat_interval() +
    scale_color_manual(values = colorspace::lighten(clrs[4], c(.8, .67, .42))) +
    scale_y_continuous(labels = dollar) +
    # Observed APCs, jittered so overplotted points stay visible.
    geom_jitter(aes(y = APC_in_dollar), alpha = alpha,
                position = position_jitter(width = 5, height = 50),
                data = unscale(country_data)) +
    facet_wrap(vars(field)) +
    labs(y = "Predicted vs. actual APC", x = expression(P["top 10%"]),
         color = "Credible interval") +
    # theme_minimal(base_family = "Hind") +
    theme_clean() +
    theme(legend.position = "bottom", panel.grid.minor = element_blank())
}
# Per-country predictive fit; alpha is tuned by country so dense
# scatters stay readable.
pred_vis(base, hm, "Austria", alpha = .5)

pred_vis(base, hm, "Brazil", alpha = .2)

This updated model fares much better for Brazil. The predictions are still not ideal (they underestimate higher APCs of around $2,000), but they are overall much better than those of the previous model.

# Further country-level checks with country-specific transparency.
pred_vis(base, hm, "China", alpha = .15)

pred_vis(base, hm, "United States", alpha = .2)

pred_vis(base, hm, "Turkey", alpha = .7)

PP check for simple model

# Posterior predictive check for the simpler model. `cores` is removed:
# the previous run warned "The following arguments were unrecognized and
# ignored: cores", i.e. it had no effect here.
pp_check(hm_simple, ndraws = 10) +
  coord_cartesian(xlim = c(0, 8000))
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

# The simple model's Brazil predictions, for comparison with hm above.
pred_vis(base, hm_simple, "Brazil", alpha = .2)

Model variances and covariances

# Full posterior summary of the mixture model: group-level SDs and
# correlations for country and field, population-level effects, and the
# mixture-family sigma parameters (printed output retained below).
summary(hm) 
##  Family: mixture(hurdle_lognormal, hurdle_lognormal) 
##   Links: mu1 = identity; sigma1 = identity; hu1 = logit; mu2 = identity; sigma2 = identity; hu2 = logit; theta1 = identity; theta2 = identity 
## Formula: APC_in_dollar | weights(total_weight) ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country) 
##          hu1 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
##          hu2 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
##          theta1 ~ 1 + (1 | field)
##    Data: base (Number of observations: 188614) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Group-Level Effects: 
## ~country (Number of levels: 69) 
##                                Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept)                  0.45      0.05     0.36     0.56 1.00
## sd(mu1_P_top10)                    0.03      0.02     0.00     0.08 1.00
## sd(mu2_Intercept)                  0.03      0.00     0.02     0.04 1.00
## sd(mu2_P_top10)                    0.00      0.00     0.00     0.01 1.00
## sd(hu1_Intercept)                  1.39      0.22     1.01     1.88 1.00
## sd(hu1_P_top10)                    0.11      0.08     0.01     0.32 1.00
## sd(hu2_Intercept)                  1.41      0.18     1.10     1.78 1.00
## sd(hu2_P_top10)                    0.17      0.11     0.01     0.40 1.01
## cor(mu1_Intercept,mu1_P_top10)    -0.12      0.30    -0.65     0.51 1.00
## cor(mu2_Intercept,mu2_P_top10)     0.05      0.32    -0.57     0.65 1.00
## cor(hu1_Intercept,hu1_P_top10)    -0.18      0.35    -0.77     0.52 1.00
## cor(hu2_Intercept,hu2_P_top10)    -0.03      0.27    -0.56     0.48 1.00
##                                Bulk_ESS Tail_ESS
## sd(mu1_Intercept)                  1365     1847
## sd(mu1_P_top10)                    1381     2230
## sd(mu2_Intercept)                  1369     2293
## sd(mu2_P_top10)                    2059     2300
## sd(hu1_Intercept)                  1478     2314
## sd(hu1_P_top10)                    1465     2330
## sd(hu2_Intercept)                  1165     1960
## sd(hu2_P_top10)                     596     1609
## cor(mu1_Intercept,mu1_P_top10)     5914     3058
## cor(mu2_Intercept,mu2_P_top10)     5158     3106
## cor(hu1_Intercept,hu1_P_top10)     4486     3248
## cor(hu2_Intercept,hu2_P_top10)     4139     2525
## 
## ~field (Number of levels: 19) 
##                                Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept)                  0.26      0.06     0.17     0.40 1.00
## sd(mu1_P_top10)                    0.07      0.03     0.02     0.13 1.00
## sd(mu2_Intercept)                  0.28      0.06     0.18     0.43 1.00
## sd(mu2_P_top10)                    0.01      0.01     0.00     0.03 1.00
## sd(hu1_Intercept)                  1.61      0.27     1.17     2.23 1.00
## sd(hu1_P_top10)                    0.11      0.06     0.01     0.25 1.00
## sd(hu2_Intercept)                  1.99      0.37     1.33     2.76 1.00
## sd(hu2_P_top10)                    0.43      0.11     0.27     0.68 1.00
## sd(theta1_Intercept)               0.94      0.28     0.54     1.61 1.00
## cor(mu1_Intercept,mu1_P_top10)     0.05      0.27    -0.46     0.57 1.00
## cor(mu2_Intercept,mu2_P_top10)    -0.10      0.31    -0.67     0.51 1.00
## cor(hu1_Intercept,hu1_P_top10)    -0.26      0.30    -0.76     0.39 1.00
## cor(hu2_Intercept,hu2_P_top10)     0.23      0.24    -0.27     0.64 1.00
##                                Bulk_ESS Tail_ESS
## sd(mu1_Intercept)                  1511     2541
## sd(mu1_P_top10)                    1932     2225
## sd(mu2_Intercept)                   914     1563
## sd(mu2_P_top10)                    1123     1742
## sd(hu1_Intercept)                  1229     2190
## sd(hu1_P_top10)                    1837     1538
## sd(hu2_Intercept)                  1034     1240
## sd(hu2_P_top10)                    1812     2479
## sd(theta1_Intercept)                854     1385
## cor(mu1_Intercept,mu1_P_top10)     4179     3157
## cor(mu2_Intercept,mu2_P_top10)     4872     3276
## cor(hu1_Intercept,hu1_P_top10)     5363     2940
## cor(hu2_Intercept,hu2_P_top10)     3031     3177
## 
## Population-Level Effects: 
##                  Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## mu1_Intercept        6.79      0.09     6.60     6.97 1.01      985     1857
## hu1_Intercept       -0.58      0.39    -1.42     0.14 1.01      602     1008
## mu2_Intercept        7.51      0.07     7.37     7.63 1.01      845     1360
## hu2_Intercept        0.30      0.43    -0.47     1.20 1.00      869     1277
## theta1_Intercept    -0.28      0.29    -0.79     0.36 1.00      731      830
## mu1_P_top10          0.18      0.03     0.12     0.24 1.00     3116     2850
## hu1_P_top10         -0.02      0.09    -0.18     0.16 1.00     2864     2389
## mu2_P_top10          0.00      0.01    -0.01     0.02 1.00     2782     2579
## hu2_P_top10         -0.18      0.15    -0.48     0.11 1.00     1854     2162
## 
## Family Specific Parameters: 
##        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma1     0.85      0.01     0.83     0.87 1.00     6525     3222
## sigma2     0.20      0.00     0.20     0.20 1.00     6132     3272
## 
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

“Marginal effects”

Here we compute marginal effects manually: we make predictions for a given x (P_top10) and for the same x * 1.01, i.e., x increased by 1% on the original scale, and then compare the two sets of predictions. (Note that the exploratory chunk immediately below uses a larger 10% step, x * 1.1.)

Fields

# Undo the centring applied to log(P_top10) so the contrast can be
# specified on the original scale.
scale_offset <- attributes(base$P_top10)[["scaled:center"]]
x1_identity <- 1500
# NOTE(review): this exploratory chunk uses a 10% step (* 1.1), while
# the prose above and average_draws() below use a 1% step — confirm
# which is intended.
x2_identity <- x1_identity * 1.1
x1_log <- log(x1_identity) - scale_offset
x2_log <- log(x2_identity) - scale_offset


# Posterior predictions at both x values for Brazil across all fields,
# expanded to one row per posterior draw.
contrast <- predictions(
  hm,
  newdata = datagrid(P_top10 = c(x1_log, x2_log),
                     country = "Brazil", field = unique(base$field))) %>% 
  posteriordraws()
  
# Pair the "base" and "step" predictions within each draw and compute
# the relative change (step / base - 1); labels rely on x1_log < x2_log.
contrast_recomputed <- contrast %>% 
  mutate(x = factor(P_top10, labels = c("base", "step"))) %>% 
  pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid), 
              names_from = x, values_from = draw) %>% 
  mutate(contrast = step / base - 1)
contrast_recomputed %>% 
  ggplot(aes(contrast, fct_reorder(field, contrast))) +
  stat_halfeye() +
  scale_x_continuous(labels = percent)

This is very sensitive to the respective country. Maybe we can recompute an average marginal effect after all?

# Posterior draws of the relative change in predicted APC when P_top10
# (original scale) is increased by a small multiplicative step, anchored
# at a given quantile of `orig_var`, for every country x field cell.
#
# @param model          Fitted brms model.
# @param orig_var       Variable on the original scale; its quantile `q`
#                       is the base value of the contrast.
# @param q              Quantile of `orig_var` to evaluate at.
# @param var_for_offset Scaled variable carrying the "scaled:center"
#                       attribute used to undo the centring.
# @param step_factor    Multiplicative step; the default 1.01 is a 1%
#                       increase (previously hard-coded).
# @return Draw-level data with columns `base`, `step` and
#         `contrast` (= step / base - 1).
average_draws <- function(model, orig_var, q = .5, 
                          var_for_offset = base$P_top10,
                          step_factor = 1.01) {
  scale_offset <- attributes(var_for_offset)[["scaled:center"]]
  
  x1_identity <- quantile(orig_var, q)
  x2_identity <- x1_identity * step_factor
  x1_log <- log(x1_identity) - scale_offset
  x2_log <- log(x2_identity) - scale_offset
  
  # Predictions at both x values for every country/field combination,
  # one row per posterior draw.
  contrast_all <- predictions(
    model,
    newdata = datagrid(P_top10 = c(x1_log, x2_log),
                       country = unique(base$country), 
                       field = unique(base$field))) %>% 
    posteriordraws()
    
  # Pair base/step within each draw and compute the relative change.
  contrast_all %>% 
    mutate(x = factor(P_top10, labels = c("base", "step"))) %>% 
    pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid), 
                names_from = x, values_from = draw) %>% 
    mutate(contrast = step / base - 1)
}

# Average the draw-level contrasts over the remaining dimension (e.g.
# over fields when grouping by country), keeping one value per draw.
#
# @param contrast_df Output of average_draws().
# @param var         Grouping column (tidy-eval), e.g. field or country.
# @return One mean effect per `var` x drawid. `.groups = "drop_last"`
#         reproduces summarise()'s default (result grouped by `var`) and
#         silences the repeated grouping message seen in the output.
summarise_by <- function(contrast_df, var = field) {
  contrast_df %>% 
    group_by({{ var }}, drawid) %>% 
    summarise(effect = mean(contrast), .groups = "drop_last")
}
# Halfeye plot of the draw-level effects per field, ordered by effect.
#
# @param contrast_df Output of summarise_by() with `field` and `effect`.
# @param location    Text describing where the effect was evaluated;
#                    interpolated into the x-axis label.
plot_effect <- function(contrast_df, location = "the median") {
  x_label <- glue::glue(
    "% change of APC for 1% change of P_top10% at {location}"
  )

  ggplot(contrast_df, aes(effect, fct_reorder(field, effect))) +
    stat_halfeye(.width = c(.5, .9), point_interval = "median_hdi") +
    scale_x_continuous(labels = percent) +
    labs(
      x = x_label,
      y = NULL,
      caption = "Averaged predictions over all countries.") +
    theme_clean() +
    coord_cartesian(xlim = c(-0.005, 0.005))
}
# Contrasts at the 20%, 50% and 80% quantiles of P_top10.
# NOTE(review): these use `df$P_top10` while earlier code uses
# `base$P_top10` — confirm `df` is the intended global data frame.
contrast_20 <- average_draws(hm, df$P_top10, q = .2)
contrast_50 <- average_draws(hm, df$P_top10, q = .5)
contrast_80 <- average_draws(hm, df$P_top10, q = .8)
contrast_20_field <- summarise_by(contrast_20, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_20_field %>% 
  plot_effect("the 20% quantile")

This seems reasonable, but would need to further validate.

At 50%

# Field-level effects at the median; plot_effect()'s default location
# label ("the median") is correct here.
contrast_50_field <- summarise_by(contrast_50, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_50_field %>% 
  plot_effect()

contrast_80_field <- summarise_by(contrast_80, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
# Pass the location explicitly: plot_effect() defaults to "the median",
# which would mislabel the x-axis for the 80% quantile contrast.
contrast_80_field %>% 
  plot_effect("the 80% quantile")

Compare all three

# Stack the three quantile contrasts. Renaming `effect` before binding
# means each row carries only its own quantile's column; the other two
# are NA and are removed by drop_na() after pivoting long.
all_joined <- bind_rows(
  rename(contrast_50_field, effect_50 = effect),
  rename(contrast_80_field, effect_80 = effect),
  rename(contrast_20_field, effect_20 = effect)
)
p <- all_joined %>% 
  pivot_longer(contains("effect"), values_to = "effect") %>% 
  drop_na() %>% 
  # Recode effect_* into the display labels that effect_scale expects.
  make_nice_effect_quantiles(name, name) %>% 
  ggplot(aes(effect, fct_reorder(field, effect), colour = name)) +
  geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
  stat_pointinterval(position = position_dodge(width = .5),
                     .width = c(.5, .9)) +
  scale_x_continuous(labels = percent) +
  labs(
    y = NULL, 
    x = expression(paste("% change of APC for 1% change of ", P["top 10%"], 
                         " at given quantiles")),
    caption = "Predictions averaged over all countries.") +
  theme_clean() +
  coord_cartesian(xlim = c(-0.005, 0.005)) +
  guides(colour = guide_legend(reverse = FALSE))
p + 
  effect_scale +
  theme(legend.position = "top", legend.justification = c(1, 0)) + 
  labs(colour = NULL) 

Questions that arise: (1) Why do some fields show a stronger or weaker effect for larger/smaller P_top10? Is this also associated with the hurdle component? (2) In particular, why are physics and mathematics negative — is this because of the hurdle component? -> Yes.

This needs to be put into the context of overall averages: report the average per field (from the model or from the full data set).

Intercept at median

# Estimated APC level per field at the median of P_top10, averaging the
# `base` predictions over countries within each posterior draw.
contrast_50 %>% 
  group_by(field, drawid) %>% 
  summarise(intercept = mean(base)) %>% 
  ggplot(aes(intercept, fct_reorder(field, intercept))) +
  stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
               point_interval = "median_hdi") +
  scale_x_continuous(labels = scales::dollar) +
  theme_clean() +
  labs(y = NULL, x = expression(paste("Estimated APC at median of ",
                                      P["top 10%"])))
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.

Countries

# Country-level effects: average the draw-level contrasts over fields.
contrast_20_country <- summarise_by(contrast_20, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_50_country <- summarise_by(contrast_50, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_80_country <- summarise_by(contrast_80, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# Same rename-then-pivot trick as for fields above. (The trailing comma
# in bind_rows() is tolerated by dplyr's dots handling.)
all_countries <- bind_rows(
  rename(contrast_20_country, effect_20 = effect),
  rename(contrast_50_country, effect_50 = effect),
  rename(contrast_80_country, effect_80 = effect),
) %>% 
  pivot_longer(contains("effect"), values_to = "effect") %>% 
  drop_na()
# Country-level dot-interval plot; raw effect_* codes are used for the
# legend here (no make_nice_effect_quantiles), matching the manual
# scale_color_manual below.
p_country <- all_countries %>% 
  ggplot(aes(effect, fct_reorder(country, effect), colour = name)) +
  geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
  stat_pointinterval(position = position_dodge(width = .5),
                     .width = c(.5, .9), point_interval = "median_hdi") +
  scale_x_continuous(labels = percent) +
  labs(
    y = NULL, 
    # Plain string: the previous glue::glue() call interpolated nothing.
    x = "% Change of APC for 1% change of P_top10% at given quantiles",
    # Corrected caption: these country-level contrasts are averaged over
    # fields (summarise_by(..., country)), not over countries.
    caption = "Predictions averaged over all fields.") +
  theme_clean() +
  coord_cartesian(xlim = c(-0.001, 0.005)) +
  guides(colour = guide_legend(reverse = TRUE))
p_country + scale_color_manual(values = c(
    effect_20 = colorspace::lighten("#007FA8", .7),
    effect_50 = colorspace::lighten("#007FA8", .4),
    effect_80 = "#007FA8"
  ))

Group by continent

# Lookup table mapping each country to its region and ISO2 country code.
country_identifier <- base %>% 
  distinct(country, region, country_code) %>% 
  drop_na()

# Explicit join key (previously inferred, emitting the
# 'Joining, by = "country"' message).
all_countries_w_region <- all_countries %>% 
  left_join(country_identifier, by = "country")
# Dot-interval plot of country-level effects, facetted by world region.
#
# @param df Long-format effects with columns country, region, name
#           (effect_20/50/80) and effect.
plot_countries_by_region <- function(df) {
  x_label <- expression(paste("% Change of APC for 1% change of ", P["top 10%"], 
                              " at given quantiles"))

  relabelled <- make_nice_effect_quantiles(df, name, color_label)

  ggplot(relabelled,
         aes(effect, fct_reorder(country, effect), colour = color_label)) +
    geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
    stat_pointinterval(position = position_dodge(width = .5),
                       .width = c(.5, .9), point_interval = "median_hdi") +
    scale_x_continuous(labels = percent) +
    labs(y = NULL,
         x = x_label,
         caption = "Predictions averaged over all fields.",
         colour = NULL) +
    theme_clean() +
    # Free y scales/space so each region facet shows only its countries.
    facet_grid(rows = vars(str_wrap(region, 9)), space = "free_y",
               scales = "free_y") +
    coord_cartesian(xlim = c(-0.001, 0.005)) +
    guides(colour = guide_legend(reverse = FALSE,
                                 override.aes = list(size = 2))) +
    theme(legend.position = "top") +
    effect_scale
}

# Split the facetted country plot into two figures for readability.
p_europe_subsahara <- all_countries_w_region %>% 
  filter(str_detect(region, "Europe|Sahara")) %>% 
  plot_countries_by_region()

p_rest <- all_countries_w_region %>% 
  filter(!str_detect(region, "Europe|Sahara")) %>% 
  plot_countries_by_region()
p_europe_subsahara

p_rest

Relate to gdp

# World Bank indicators for 2019; the NY.GDP.PCAP.KD column used below
# is GDP per capita.
gdp <- WDI::WDI(start = 2019, end = 2019)

# Join on ISO2 code; `country` exists in both tables, producing the
# country.x / country.y columns seen in the printout.
country_effect_with_gdp <- all_countries_w_region %>% 
  left_join(gdp, by = c("country_code" = "iso2c"))
country_effect_with_gdp
## # A tibble: 828,000 x 9
##    country.x drawid name     effect region country_code country.y NY.GDP.PCAP.KD
##    <chr>     <fct>  <chr>     <dbl> <chr>  <chr>        <chr>              <dbl>
##  1 Algeria   1      effect~ 0.00297 Middl~ DZ           Algeria            4115.
##  2 Algeria   2      effect~ 0.00103 Middl~ DZ           Algeria            4115.
##  3 Algeria   3      effect~ 0.00362 Middl~ DZ           Algeria            4115.
##  4 Algeria   4      effect~ 0.00302 Middl~ DZ           Algeria            4115.
##  5 Algeria   5      effect~ 0.00224 Middl~ DZ           Algeria            4115.
##  6 Algeria   6      effect~ 0.00217 Middl~ DZ           Algeria            4115.
##  7 Algeria   7      effect~ 0.00162 Middl~ DZ           Algeria            4115.
##  8 Algeria   8      effect~ 0.00148 Middl~ DZ           Algeria            4115.
##  9 Algeria   9      effect~ 0.00125 Middl~ DZ           Algeria            4115.
## 10 Algeria   10     effect~ 0.00122 Middl~ DZ           Algeria            4115.
## # ... with 827,990 more rows, and 1 more variable: year <int>
# Median-quantile effect per country (point + 50% HDI) against GDP per
# capita, coloured by region.
p <- country_effect_with_gdp %>% 
  filter(name == "effect_50") %>% 
  rename(gdp = NY.GDP.PCAP.KD) %>% 
  group_by(country_code, country.x, gdp, region) %>% 
  point_interval(effect, .width = .5, .point = median, .interval = hdi) %>% 
  ggplot(aes(gdp, effect, colour = region, label = country.x)) +
  geom_pointrange(aes(ymin = .lower, ymax = .upper)) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::comma) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  labs(colour = NULL, x = "GDP per capita", 
       y = "% increase in APC for 1% increase of P_top10 at the median") +
  theme_clean() +
  theme(legend.position = "top")  +
  coord_cartesian(ylim = c(0, .003))
p
## Warning: Removed 1 rows containing missing values (geom_pointrange).

# Interactive version; the unused `label` aesthetic feeds the tooltip.
plotly::ggplotly(p)

This could still be improved by adding number of universities (or papers) as a size variable.

# Number of distinct universities per country, used as the alpha weight.
# NOTE(review): uses the global `df` (not `base`) — confirm intended.
unis_p_country <- df %>% 
  distinct(country, University) %>% 
  count(country, name = "n_universities") 
# Same median-effect summary as for plot `p` above, now with the
# university counts joined on.
pdata <- country_effect_with_gdp %>% 
  filter(name == "effect_50") %>% 
  rename(gdp = NY.GDP.PCAP.KD) %>% 
  group_by(country_code, country.x, gdp, region) %>% 
  point_interval(effect, .width = .5, .point = median, .interval = hdi) %>% 
  left_join(unis_p_country, by = c("country.x" = "country"))

# Label only a hand-picked set of countries; all others get "".
# (Note: `labels` shadows base::labels within this script.)
labels <- pdata %>% 
  mutate(label = case_when(
    country.x %in% c("China", "India", "Iran", "Germany", "United States",
                      "Brazil", "Luxembourg", "Czech Republic") ~ country.x,
    TRUE ~ ""))

# GDP vs. median effect with alpha mapped (log10) to the number of
# universities, and repelled text labels for the selected countries.
pdata %>% 
  ggplot(aes(gdp, effect, colour = region, label = "")) +
  geom_linerange(aes(ymin = .lower, ymax = .upper, alpha = n_universities)) +
  geom_point(aes(alpha = n_universities), size = 2) +
  ggrepel::geom_text_repel(data = labels, aes(label = label),
                            show.legend = FALSE, max.overlaps = Inf,
                           box.padding = 1, min.segment.length = 0) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::comma) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  # scale_size_continuous(trans = "sqrt") +
  scale_alpha_continuous(range = c(.2, 1), trans = "log10") +
  labs(colour = "Region", x = "GDP per capita", alpha = "Number of universities",
       y = expression(
         paste("% increase in APC for 1% increase of ", P["top 10%"], 
               " at the median")
         )) +
  theme_clean() +
  theme(legend.position = "top", legend.box = "vertical")  +
  coord_cartesian(ylim = c(0, .003))
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

# Estimated APC level per country at the median of P_top10, averaging
# the `base` predictions over fields within each posterior draw.
contrast_50 %>% 
  group_by(country, drawid) %>% 
  summarise(intercept = mean(base)) %>% 
  ggplot(aes(intercept, fct_reorder(country, intercept))) +
  stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
               point_interval = "median_hdi") +
  scale_x_continuous(labels = scales::dollar) +
  theme_clean() +
  labs(y = NULL, x = expression(paste("Estimated APC at median of ",
                                      P["top 10%"])))
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.

Not sure whether this is helpful. What we show is quite abstract: the APC at a hypothetical level of P_top10, averaged across all fields (weighting each field equally).

The same would apply to the field intercepts above.